/* 

This file creates a count of relevant employees in occupations related to each industry, 
at the state-county-year-naics level. The purpose of this is that there may not be
many citizens directly employed in an industry in a given county, but there may 
be employees in occupations that tend to be associated with an industry.

Strategy is:
1. Obtain top BLS occupation codes (SOC codes) for each industry (naics) code
2. Map SOC codes to ACS codes (and thus, naics codes to ACS codes)
3. Get count of employees in ACS codes, for each state-county-year, aggregate to
naics-county-year level using naics-ACS mapping

*/

*-------------------------------------------------------------------------------*
* Clean up industry codes and create naics-occupation code mapping

import excel "$datadir/raw/bls_emp_matrix_21.xlsx", clear
drop in 1
keep A B C E H
rename (A B C E H) (occ_type ind_type occ_code ind_code perc_ind)
destring perc_ind, replace

keep if occ_type == "Line item" // drop aggregated occupation codes

drop if substr(ind_code, 1, 2) == "TE"
drop if regexm(ind_code, ",|-")
assert length(ind_code) == 6

drop if substr(ind_code, 5, 2) != "00" & !regexm(ind_code, "A") // drop if naics5 or naics6 code

* There are some industry codes that have "A" in the title and correspond to multiple 
* naics codes. The linked file is a manual crosswalk to deal with those instances
preserve
	import excel "$datadir/crosswalk/ind_code_naics4_xwalk.xlsx", firstrow clear 
	save "${temp}/ind_code_naics4_xwalk", replace
restore

joinby ind_code using "${temp}/ind_code_naics4_xwalk", unmatched(master)
drop if _m == 1 & regexm(ind_code, "A")
drop _m

* create naics4-occ_code mapping
preserve
	gen naics4_str = substr(ind_code, 1, 4)
	destring naics4_str, gen(naics4_num)
	replace naics4 = naics4_num if missing(naics4)
	keep occ_code perc_ind naics4

	save "${temp}/emp_matrix_clean_naics4", replace
restore

* create naics3-occ_code mapping
preserve
	drop if substr(ind_code, 4, 3) != "000"
	
	gen naics3_str = substr(ind_code, 1, 3)
	destring naics3_str, gen(naics3_num)
	gen naics3 = naics3_num
	keep occ_code perc_ind naics3

	save "${temp}/emp_matrix_clean_naics3", replace
restore

*-------------------------------------------------------------------------------*
/* 
Get naics codes relevant to this project, and obtain the top occupation
codes (where n_occ_codes is the number of codes) for each naics code, at both
naics3 and naics4 level 
*/

local n_occ_codes 5 2

u $datadir/sample_year_naics, clear 

keep naics4 //don't need years 
duplicates drop 

* generate var for naics3
tostring naics4, replace
gen naics3 = substr(naics4, 1, 3)
destring naics3 naics4, replace

save "${temp}/naics3_naics4_unique", replace // unique combinations of naics3 and naics4 codes

* merge on naics to obtain occupation codes from bls employment matrix
foreach naics in naics3 naics4 {
	foreach n of local n_occ_codes {
		preserve
			keep `naics'
			duplicates drop
			joinby `naics' using "${temp}/emp_matrix_clean_`naics'"
			gsort `naics' -perc_ind
			
			bys `naics': keep if _n <= `n'
			save "${temp}/`naics'_occ_code_top_`n'", replace
		restore
	}
}

* indicate if there is any occupation code match for each naics
foreach naics in naics3 naics4 {
	preserve
		keep `naics'
		duplicates drop
		joinby `naics' using "${temp}/emp_matrix_clean_`naics'", unmatched(master)
		egen match = max(_m == 3), by(`naics')
		keep `naics' match
		duplicates drop
		save "${temp}/`naics'_match", replace
	restore
}

*-------------------------------------------------------------------------------*
/* Clean up occ2010-occ2018 (ACS occupation codes) crosswalk. The BLS employment
matrix uses occ2018 codes, and the Census data uses occ2010 codes, making this step 
necessary */

import excel "${datadir}/crosswalk/2018-occupation-code-list-and-crosswalk.xlsx", sheet("Occ Code Changes") cellrange(A4) clear
keep A C
rename (A C) (occ2010 occ2018)
carryforward occ2010, replace
carryforward occ2018, replace
destring occ2010, replace
destring occ2018, replace
save "${temp}/occ2010_2018_xwalk", replace

*-------------------------------------------------------------------------------*
* Create NAICS-occ2010 crosswalk

local n_occ_codes 5 2

* Load in occ_code-occ2018 crosswalk
import excel "${datadir}/crosswalk/nem-occcode-acs-crosswalk.xlsx", clear
drop in 1/5
keep B D
rename (B D) (occ_code occ2018)
destring occ2018, replace

* Get occ2010 code from occ2018 code
joinby occ2018 using "${temp}/occ2010_2018_xwalk", unmatched(master)
drop _m
replace occ2010 = occ2018 if missing(occ2010)

* Get NAICS codes from occ_codes; reduce so that crosswalk is unique at NAICS-occ2010 level
foreach naics in naics3 naics4 {
	foreach n of local n_occ_codes {
		preserve
			joinby occ_code using "${temp}/`naics'_occ_code_top_`n'"
			keep occ2010 `naics'
			duplicates drop
			save "${temp}/`naics'_occ2010_top_`n'", replace
		restore
	}
}

*-------------------------------------------------------------------------------*
* Obtain county-level employment in occupations related to industry. Final dataset
* is at the NAICS-State-County-Year level

*Prep for merge to create full panel of naics-year-county combinations
	use "${temp}/naics3_naics4_unique", clear
	gen tempmerge = 1
	save "${temp}/naics3_naics4_merge", replace
	

* Clean up census data and impute missing counties using PUMAs
use "$temp/census_raw", clear
drop metro strata gq proptx pernum race - educd 
replace countyfip = . if countyfip == 0
drop if countyfip == . & missing(puma)

drop if occ2010 == 9920 // unemployed

// impute missing fips codes from puma codes for 2011 and before
preserve
	keep if year <= 2011 & missing(countyfip) & !missing(puma)
	joinby statefip puma using "$datadir/crosswalk/puma2000_2_fips.dta", update
	save "${temp}/puma2000_fips_imputations", replace
restore

// impute missing fips codes from puma codes for 2012 and later
preserve
	keep if year > 2011 & missing(countyfip) & !missing(puma)
	joinby statefip puma using "$datadir/crosswalk/puma2010_2_fips.dta", update
	save "${temp}/puma2010_fips_imputations", replace
restore

// drop if fips codes are missing to avoid duplicates
drop if year <= 2011 & missing(countyfip) & !missing(puma)
drop if year > 2011 & missing(countyfip) & !missing(puma)

// add in fips imputations
append using "${temp}/puma2000_fips_imputations"
append using "${temp}/puma2010_fips_imputations"

assert !missing(countyfip)

// recalculate weights if person is assigned to multiple counnties
replace perwt = perwt / n_fips if !missing(n_fips)

preserve
	keep statefip countyfip year
	duplicates drop
	gen tempmerge = 1
	joinby tempmerge using "${temp}/naics3_naics4_merge"
	drop tempmerge
	
	egen id = group(statefip countyfip naics4 naics3)
	tsset id year
	tsfill, full
	
	foreach v of varlist statefip countyfip naics4 naics3 {
		egen `v'_temp = mode(`v'), by(id)
		replace `v' = `v'_temp
		drop `v'_temp
	}
	drop id
	save "${temp}/census_naics_full_panel", replace
restore
	
* get county-level employment counts by occupation
gen dummy = 1
gen dummy2 = 1 if incwage!= 0
assert incwage!=999999
collapse (sum) occ2010_incwage_tot = incwage (count) occ2010_emp_count = dummy occ2010_incwage_count = dummy2 [pweight=perwt], by(statefip countyfip occ2010 year)

* interpolate between years within county if no observations for an occupation
egen id = group(statefip countyfip occ2010)
tsset id year
tsfill, full

foreach v of varlist occ2010_emp_count occ2010_incwage_tot occ2010_incwage_count {
	ipolate `v' year, by(id) gen(temp) 
	replace `v' = temp
	drop temp
}

foreach v of varlist statefip countyfip occ2010 {
	egen temp = mode(`v'), by(id)
	replace `v' = temp
	drop temp
}
drop id

* get NAICS code from occupation code; collapse employment counts and wage income to NAICS-county level
local n_occ_codes 5 2
foreach naics in naics3 naics4 {
	foreach n of local n_occ_codes {
		preserve
			joinby occ2010 using "${temp}/`naics'_occ2010_top_`n'"
			collapse (sum) occ2010_incwage_tot occ2010_incwage_count `naics'_occ_emp_count_top_`n' = occ2010_emp_count, by(statefip countyfip `naics' year)
			gen `naics'_occ_incwage_top_`n' = occ2010_incwage_tot / occ2010_incwage_count
			drop occ2010_inc*
			save "${temp}/`naics'_occ_top_`n'", replace
		restore
	}
}

********************************************************************************
* merge all results into single dataset
use "${temp}/census_naics_full_panel", clear

local n_occ_codes 5 2
foreach naics in naics3 naics4 {
	foreach n of local n_occ_codes {
		merge m:1 `naics' statefip countyfip year using "${temp}/`naics'_occ_top_`n'", keep(1 3) nogen
	}
}

* occupation counts should be set to 0 if there is some occupation match for a NAICS code.
* occupation counts should be set to missing if there is no occupation match for a NAICS code.
* wage income should be set to 0 if there is some occupation match for a NAICS code.
* wage income should be set to missing if there is no occupation match for a NAICS code.
foreach naics in naics3 naics4 {
	merge m:1 `naics' using "${temp}/`naics'_match", nogen
	foreach n of local n_occ_codes {
		replace `naics'_occ_emp_count_top_`n' = 0 if match == 1 & missing(`naics'_occ_emp_count_top_`n')
		replace `naics'_occ_emp_count_top_`n' = . if match == 0 & `naics'_occ_emp_count_top_`n' == 0
		replace `naics'_occ_incwage_top_`n' = 0 if match == 1 & missing(`naics'_occ_incwage_top_`n')
		replace `naics'_occ_incwage_top_`n' = . if match == 0 & missing(`naics'_occ_incwage_top_`n')
	}
	drop match
}


save "${datadir}/naics4_occ_vars", replace

****************************************************************************************************

